Preprocessing Dataset
Libraries
Library for read dataset.
library(readr)Library for data frames processing.
library(dplyr)
library(tidyr)Library for R Markdown.
library(rmarkdown)
library(knitr)Library for data presentation.
library(scales)Library for manage strings.
library(stringr)Load dataset
Set the path of the dataframe file.
path_import = "../virusTotal/data/virusTotalOriginal.csv"
path_export = "../virusTotal/data/virusTotal.csv"Load dataset.
df <- read_csv(path_import)Statistics
Dimensions.
dim(df)## [1] 183 447
Types
View witch types are in the dataset.
col_types_all <-
df %>%
sapply(typeof) %>%
unlist()
col_types_table <-
col_types_all %>%
table()
col_types <-
col_types_table %>%
as.vector()
names(col_types) <- names(col_types_table)## character double logical
## 158 204 85
As can be seen there are the three expected types: character, double and logical.
NA
Percentaje of NA values
Define function to see the amount of NA values in the dataframe.
percent_of_NA <-
function(df){
num_of_NA <-
df %>% is.na() %>% sum()
num_of_values <-
df %>% dim() %>% prod()
percent_of_NA <-
(num_of_NA / num_of_values) %>%
percent()
return(percent_of_NA)
}percent_of_NA(df)## [1] "37%"
Columns with NA
Define functions to see the NA in columns.
num_of_NA_by_column <-
function(df){
df %>% is.na() %>% colSums()
}remove_0 <-
function(x) x[x!=0]names_of_colums_with_NA <-
function(df)
df %>%
num_of_NA_by_column() %>%
remove_0 %>%
names()percentaje_of_cols_with_NA <-
function(df)
(length(names_of_colums_with_NA(df)) / ncol(df)) %>%
percent()Compute the percentaje of cols with NA.
percentaje_of_cols_with_NA(df)## [1] "74%"
Inspect if there are columns full of NA.
is_full_of_NA <- function(col){
num_of_NA <-
col %>%
is.na() %>%
sum()
return(num_of_NA == length(col))
}cols_full_of_NA <-
df %>%
select_if(is_full_of_NA) %>%
names()## [1] "authentihash" "scans.Bkav.result"
## [3] "scans.CMC.result" "scans.ALYac.result"
## [5] "scans.Malwarebytes.result" "scans.K7AntiVirus.result"
## [7] "scans.Baidu.result" "scans.SUPERAntiSpyware.result"
## [9] "scans.Gridinsoft.result" "scans.ViRobot.result"
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"
## [13] "scans.VBA32.result" "scans.Zoner.result"
## [15] "scans.Panda.result" "scans.Elastic.result"
## [17] "scans.Cylance.result" "scans.SentinelOne.result"
As can be seen there are many columns that are full of NA, so can be deleted.
df <-
select(df, -all_of(cols_full_of_NA))Colums with the same value
Maybe there are columns that has the same value along all the vector, so are useless.
Define function to remove these columns.
different_values <-
function(x)
x %>% na.omit() %>% unique() %>% length()remove_columns_with_the_same_value <-
function(df)
select_if(df, function(col) different_values(col) > 1)Apply function.
num_of_cols_after_remove <-
df %>%
remove_columns_with_the_same_value() %>%
ncol()Calculate the number of columns with same value.
ncol(df) - num_of_cols_after_remove## [1] 147
Awesom! Many colums found. Let’s remove them.
df <-
remove_columns_with_the_same_value(df)Inspecting dataframe
Now let’s deeply inspect into the dataframe.
View dataframe
View dataframe.
Renaming
The column “…1” is the row number, so “n” will be a better name. The “…JSON” it’s a bad name, just “json” is fine.
df <-
df %>%
rename(n = ...1, json=..JSON)Removing cols
There are many duplicated cols, hashes & dates that can be removed, also many useless.
Dates
There are many dates in the dataset, that are not relevant for virus analysis. So let’s remove them.
Define a function for check if a col is of type Date.
not <-
function(x) !x
get_element <-
function(x, index) x[index]
is_date_col <-
function(col, pattern="^[:digit:]{4}[-:/][:digit:]{2}[-:/][:digit:]{2}")
col %>%
as.character() %>%
na.omit() %>%
get_element(1) %>%
str_detect(pattern)Columns detected.
df %>%
select_if(is_date_col) %>%
head() %>%
paged_table()Define function for remove cols by a predicate.
remove_col_if <-
function(df, fun){
cols_to_delete <-
df %>%
select_if(fun) %>%
colnames()
df <-
df %>%
select(-cols_to_delete)
return(df)
}Remove them.
df <-
remove_col_if(df, is_date_col)## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols_to_delete)` instead of `cols_to_delete` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Hashes
There are many hashes cols that don’t really provide useful information. So remove them.
hashes <-
c("")
df <-
df %>%
select(
-vhash,
-sha256,
-sha1,
-scan_id,
-ssdeep,
-md5,
-additional_info.androguard.certificate.serialnumber,
-additional_info.androguard.certificate.thumbprint,
-additional_info.exiftool.ZipCRC
)Scans
There are many scans of different antivirus, that has very similar information. Just keep the scan with less NA values.
Get the best col.
scans_col_names <-
df %>%
colnames() %>%
str_match_all("scans.[:alpha:]*.result") %>%
unlist()
scan_na_by_col <-
df %>%
select(all_of(scans_col_names)) %>%
num_of_NA_by_column()
scan_col_witch_min_na <-
scan_na_by_col %>%
which.min()
best_scanner_colname <-
scan_na_by_col %>%
names() %>%
get_element(scan_col_witch_min_na)
best_scanner_col <-
df %>%
select(all_of(best_scanner_colname))
best_scanner_name <-
best_scanner_colname %>%
str_split("[.]") %>%
unlist() %>%
get_element(2)## [1] "Best scan col: scans.Fortinet.result"
## [1] "Best scanner: Fortinet"
Drop all scans but the best.
col_index_scanners <-
df %>%
colnames() %>%
str_detect("scans") %>%
unlist() %>%
which()
df <-
df %>%
select(-all_of(col_index_scanners)) %>%
cbind(best_scanner_col)Individual columns
Reasons:
- json column contains all the row as JSON.
- permalink is the URL where Virus Total has the virus file.
- Main.Activity & Package are strings with all different values.
- FileTypeExtension, ZipFileName & MIMEType has same values as FileType.
- ZipBitFlag doesn’t seems to matter.
- additional_info.magic has the vesion of the ZIP file, that doesn’t seems to matter.
- Subject.DN is the JSON fragment that has all the information about the subject, but these data are decomposed in the rest of Subject cols.
df <-
df %>%
select(
-json,
-permalink,
-additional_info.androguard.AndroidApplicationInfo,
-additional_info.androguard.Main.Activity,
-additional_info.exiftool.MIMEType,
-additional_info.exiftool.FileTypeExtension,
-additional_info.exiftool.ZipFileName,
-additional_info.magic,
-additional_info.androguard.Package,
-additional_info.androguard.certificate.Subject.DN,
-additional_info.compressedview.uncompressed_size
)Groups of columns
Define a function to remove cols which name match a pattern.
remove_cols_which_name_match <-
function(df, pattern){
cols_to_remove <-
df %>%
colnames() %>%
str_which(pattern)
df_removed_cols <-
df %>%
select(-all_of(cols_to_remove))
return(df_removed_cols)
}Remove groups.
Reasons:
- Issuer group has the same information as Subject group.
- CompressedView and RiskIndicator.APK groups have the same information as file_type group
df <-
df %>%
remove_cols_which_name_match("^additional_info.androguard.certificate.Issuer.[:alpha:]*$") %>%
remove_cols_which_name_match("^additional_info.compressedview.extensions.[:alpha:]*$") %>%
remove_cols_which_name_match("^additional_info.androguard.RiskIndicator.APK.[:alpha:]*$")View results
Define a function for sort columns.
sort_cols <-
function(df){
df <-
df %>%
select(order(colnames(df)))
additionalInfo_cols_logical <-
df %>%
colnames() %>%
str_detect("additional_info")
additionalInfo_cols <-
df %>%
select(which(additionalInfo_cols_logical))
not_additionalInfo_cols <-
df %>%
select(which(!additionalInfo_cols_logical)) %>%
select(n, size, everything())
return(cbind(not_additionalInfo_cols, additionalInfo_cols))
}Sort columns.
df <-
sort_cols(df)View results.
Replacing values
Replace “Unknown” and “?” by NA
There are some columns that hast the value “Unknown” or “?” instead of NAs. So let’s replace them.
Define a function to replace values in cols that satisfy a predicate.
replace_when <-
function(df, fun, value, replacement){
cols_to_replace <-
df %>%
select_if(fun) %>%
colnames()
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace(col, which(col==value), replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}Replace ? and Unknown for NAs.
df <-
df %>%
replace_when(function(col) any(str_detect(col, fixed("Unknown"))), "Unknown", NA) %>%
replace_when(function(col) any(str_detect(col, fixed("?"))), "?", NA)Replace NA for 0
Permissions colums
The permissions columns (PERM) seems that there are NAs where there should be 0s. So it would be better to replace them.
replace_na_which_colname_match <-
function(df, pattern, replacement){
cols_to_replace <-
df %>%
colnames() %>%
str_which(pattern)
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}replace_na_when <-
function(df, fun, replacement){
cols_to_replace <-
df %>%
select_if(fun) %>%
colnames()
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}pattern <- "additional_info.androguard.RiskIndicator.PERM"
df <-
df %>%
replace_na_which_colname_match(pattern, 0)
df %>%
select(., str_which(colnames(.), pattern)) %>%
paged_table()New & modifiead colums
There are columns that must be only one, others provide more information by operating two columns, or summarises information.
New total permissions column
Create a new column that sums all permissions of permissions columns.
pattern <- "additional_info.androguard.RiskIndicator.PERM"
df_without_permissions <-
df %>%
select(., -(str_which(colnames(.), pattern)))
df_permissions <-
df %>%
select(., str_which(colnames(.), pattern)) %>%
mutate(., total_PERMs = rowSums(.))
df <- cbind(df_without_permissions, df_permissions)Merge both MP3 columns into one
There are two mp3 extension columns, one for .MP3 and the other for .mp3, both are mp3 files. The others columns has the name of the extension in upper case, so let’s sum both into the MP3 column.
mp3_cols_logical <-
df %>%
colnames() %>%
tolower() %>%
str_detect("mp3")
mp3_upper_colname <-
colnames(df)[which(mp3_cols_logical)] %>%
str_match("^.*MP3$") %>%
unlist() %>%
na.omit()sprintf("MP3 colname: %s", mp3_upper_colname)## [1] "MP3 colname: additional_info.compressedview.file_types.MP3"
Sum them into additional_info.compressedview.file_types.MP3
mp3_col <-
df %>%
select(which(mp3_cols_logical)) %>%
rowSums(na.rm = TRUE)
df <-
df %>%
select(-which(mp3_cols_logical)) %>%
mutate(additional_info.compressedview.file_types.MP3 = mp3_col)Express increased uncompressed size as percentage
There are two columns that refers to ZIP size, one for the compressed size and the other for the uncompressed size. It would be easier to compare them with the percentage of size increased after decompressed it.
increased_size_after_unzip <-
df$additional_info.exiftool.ZipUncompressedSize /
df$additional_info.exiftool.ZipCompressedSize
df <-
df %>%
select(-additional_info.exiftool.ZipUncompressedSize,
-additional_info.exiftool.ZipCompressedSize) %>%
mutate(additional_info.exiftool.ZipIncreasedUncompressedSize = increased_size_after_unzip)Suspicious heuristic as logical
The column additional_info.trendmicro.housecall.heuristic has a suspicious flag for some rows, the others are NA.
- So will be better to make this column logical:
- NA -> FALSE
- Suspicious -> True
- Also rename it to make it more understandable:
- additional_info.trendmicro.housecall.heuristic -> additional_info.suspicious
suspicious <-
df$additional_info.trendmicro.housecall.heuristic %>%
is.na() %>%
not()
df <-
df %>%
select(-additional_info.trendmicro.housecall.heuristic) %>%
mutate(additional_info.suspicious = suspicious)Decompose columns
The columns additional_info.f.prot.unpacker & additional_info.trid has composed values.
View results
Sort columns.
df <-
sort_cols(df)View results.
Save dataframe
After all preprocessing let’s save it into CSV.
write.csv(df, path_export)Functions for preprocessing
As factor
labels <-
function(n){
if(n == 5){
return(c("very low", "low", "medium", "high", "very high"))
}else if(n == 4){
return(c("very low", "low", "high", "very high"))
}else if(n == 3){
return(c("low", "medium", "high"))
}else if(n == 2){
return(c("low", "high"))
}else{
stop("Not avalible")
}
}
cut_by_quantiles <-
function(col){
quantiles <-
col %>%
quantile(na.rm = TRUE) %>%
unique()
if(length(quantiles) > 2){
col <-
col %>%
cut(breaks = quantiles,
labels = labels(length(quantiles)-1),
include.lowest = TRUE)
}
return(col)
}
df_cut_by_quantiles <-
function(df){
df_without_numeric <-
df[sapply(df, function(col) !is.numeric(col))]
df_numeric <-
df %>%
select_if(is.numeric)
df_numeric <-
df_numeric %>%
lapply(cut_by_quantiles)
return(cbind(df_without_numeric, df_numeric))
}